#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(WGCNA)
## ==========================================================================
## *
## *  Package WGCNA 1.66 loaded.
## *
## *    Important note: It appears that your system supports multi-threading,
## *    but it is not enabled within WGCNA in R. 
## *    To allow multi-threading within WGCNA with all available cores, use 
## *
## *          allowWGCNAThreads()
## *
## *    within R. Use disableWGCNAThreads() to disable threading if necessary.
## *    Alternatively, set the following environment variable on your system:
## *
## *          ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## *    for example 
## *
## *          ALLOW_WGCNA_THREADS=4
## *
## *    To set the environment variable in linux bash shell, type 
## *
## *           export ALLOW_WGCNA_THREADS=4
## *
## *     before running R. Other operating systems or shells will
## *     have a similar command to achieve the same aim.
## *
## ==========================================================================
library(expss)
library(polycor)
library(knitr)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

clustering_selected = 'DynamicHybridMergedSmall'

print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_15_WGCNA.Rmd)

# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% 
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))


# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]


rm(DE_info, GO_annotations, clusterings)
print(paste0('There are ', length(unique(dataset$Module)),' modules:'))
## [1] "There are 36 modules:"
dataset$Module %>% table %>% sort(decreasing=TRUE) %>% print
## .
## #53B400 #00BD64 #C17FFF #F17E4F #E76BF3 #F365E6 #C49A00 #FD6F86 #8195FF 
##    1930    1555    1339    1253    1149    1086     977     942     856 
## #00C0BB #4B9FFF #00C094 #00BB45 #00B70C #00BF7D #D29300 #FF61C5    gray 
##     738     575     566     414     400     327     314     247     178 
## #E88523 #F8766D #00A8FF #FF64B2 #75AF00 #8EAB00 #A58AFF #A3A500 #00BECD 
##     176     160     131     128     125     120     114      99      97 
## #D774FD #00B0F6 #00B6EB #00BBDD #FF699D #B4A000 #DE8C00 #FB61D7 #00C1A8 
##      97      95      94      82      72      45      45      42      32

Module-Trait associations

datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
            rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = dataset$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits, ML=TRUE)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}
## [1] "1 correlation(s) could not be calculated"
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

rm(ME_object, datTraits, MEs, moduleTraitCor, moduleTraitPvalue, textMatrix)

Module vs gene analysis

Gene significance (Gene-Diagnosis correlation)

Module-Diagnosis correlation and Gene Significance for the genes within each module make sense

plot_data = data.frame(MTcor=unique(dataset$MTcor)) %>% arrange(by=MTcor) %>% mutate(order=1:length(unique(dataset$MTcor))) %>%
            left_join(dataset, by='MTcor') %>% filter(!is.na(MTcor)) %>% dplyr::select(ID, MTcor, GS, Module, order)


ggplotly(plot_data %>% ggplot(aes(order, GS, group=order)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
         geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() + 
         xlab('Modules ordered by Module-Diagnosis correlation') + ylab('Gene Significance'))
ggplotly(plot_data %>% ggplot(aes(MTcor, GS)) + geom_hline(yintercept=0, color='gray', linetype='dotted') + 
         geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', alpha=0.3) + 
         theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('Gene Significance'))

Log Fold Change

Module-Diagnosis correlation and log2FoldChange for the genes within each module make sense

plot_data = plot_data %>% left_join(genes_info, by='ID') %>% select(ID, order, Module, MTcor, log2FoldChange, baseMean)

ggplotly(plot_data %>% ggplot(aes(order, log2FoldChange, group=order)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
         geom_boxplot(fill=unique(plot_data$Module)) + 
         theme_minimal() + xlab('Modules ordered by Module-Diagnosis correlation') + ylab('log2FoldChange'))
ggplotly(plot_data %>% ggplot(aes(MTcor, log2FoldChange)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
         geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', alpha=0.3) + 
         theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('log2FoldChange'))

Mean expression

Modules with negative module-trait correlations seem to have higher mean expression than modules with a positive relation

ggplotly(plot_data %>% ggplot(aes(order, log2(baseMean+1), group=order)) + 
         geom_hline(yintercept=mean(log2(plot_data$baseMean+1)), color='gray', linetype='dotted') +
         geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() + 
         xlab('Modules ordered by Module-Diagnosis correlation') + ylab('log2(Mean Expression)'))
ggplotly(plot_data %>% ggplot(aes(MTcor, log2(baseMean+1))) + geom_point(alpha=0.2, color=plot_data$Module, aes(id=ID)) + 
         geom_hline(yintercept=mean(log2(plot_data$baseMean+1)), color='gray', linetype='dotted') + 
         geom_smooth(color='gray', alpha=0.3) + theme_minimal() + xlab('Module-Diagnosis correlation'))

Relation to SFARI genes

There doesn’t seem to be a positive relation between Module-Diagonsis (absolute) correlation and the percentage of SFARI genes in the module (I hoped there would be)

plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>% 
            group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
            mutate(p=round(p/n*100,2)) 

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(this_row$hasSFARIscore==FALSE & this_row$p==100){
    new_row = this_row
    new_row$hasSFARIscore = TRUE
    new_row$p = 0
    plot_data = plot_data %>% rbind(new_row)
  }
}

plot_data = plot_data %>% filter(hasSFARIscore==TRUE)

ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', alpha=0.1) +
         geom_point(color=plot_data$Module, aes(id=Module)) +
         xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
         theme_minimal() + theme(legend.position = 'none'))
rm(this_row, i)

There doesn’t seem to be a relation between a module’s mean expression and the percentage of SFARI genes it contains (I would have thought there would be)

aux_data = data.frame('Module'=dataset$Module, 'meanExpr'=rowMeans(datExpr)) %>% group_by(Module) %>% 
           summarise(meanExpr=mean(meanExpr))

plot_data = plot_data %>% left_join(aux_data, by='Module')

ggplotly(plot_data %>% ggplot(aes(meanExpr, p, size=n)) + geom_smooth(color='gray', alpha=0.1) +
         geom_point(color=plot_data$Module, aes(id=Module)) + 
         theme_minimal() + theme(legend.position = 'none'))


An analysis of the genes belonging to the three modules with the highest Module-Diagonsis correlation can be found in PhD-Models/FirstPUModel/RMarkdowns/19_10_16_dataset_EA.html